ggplot2


Grammar of Graphics

  • Data & Aesthetics & Geometries & Facets & Statistics & Coordinates & Themes

Data

#setwd("/home/creatrol/ws/R/Tutorials")
library(ggplot2)
library(dplyr)
library(tidyr)

factors

head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
# cyl is stored as a number, so it is treated as a numeric x variable
ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg)) +
  geom_point()

# wrapping cyl in factor() makes it a categorical x variable instead
ggplot(data = mtcars, mapping = aes(x = factor(cyl), y = mpg)) +
  geom_point()


Point color & size & shape

# disp is numeric (see class(mtcars$disp)); it is mapped to colour and size

# colour
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, col = disp)) +
  geom_point()

# size
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, size = disp)) +
  geom_point()

# shape -- cannot be mapped to a continuous variable, hence factor(cyl)
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, shape = factor(cyl))) +
  geom_point()


Point multiple variables

  • not a good way
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# first layer: sepal measurements (mappings taken from ggplot())
ggplot(data = iris, mapping = aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point() +
  # second layer: overrides x/y with the petal measurements, drawn in red
  geom_point(mapping = aes(x = Petal.Length, y = Petal.Width), col = "red")

  • the right way is to tidy data and use “col =”

Point Plots with groups (facet_grid)

  • consider the relationship between all 4 variables
# tidy data:
# 1. stack the four measurement columns into key/value pairs, keeping Species
iris.tidy <- gather(iris, Part.Measure, Value, -Species)
# 2. split keys such as "Sepal.Length" on the literal dot into Part / Measure
iris.tidy2 <- separate(iris.tidy, Part.Measure, c("Part", "Measure"), sep = "\\.")
#iris.tidy2 <- iris.tidy %>% mutate(Part.Measure = gsub(pattern = "\\.", replacement = "_", x = Part.Measure)) %>% 
#  separate(Part.Measure, into = c("Part", "Measure"), sep= "_")
# 3. give every observation a unique key so spread() can place each value;
#    seq_len() stays correct even for a zero-row data frame (1:0 would not)
iris.tidy2$row <- seq_len(nrow(iris.tidy2))
# 4. wide layout: one value column per species
iris.tidy <- spread(iris.tidy2, Species, Value)
# 5. drop the helper row key again, keeping the first four columns
iris.tidy2 <- iris.tidy2[, 1:4]
# plot -- the hard way: one separate plot per species column
str(iris.tidy)
## 'data.frame':    600 obs. of  6 variables:
##  $ Part      : chr  "Petal" "Petal" "Petal" "Petal" ...
##  $ Measure   : chr  "Length" "Length" "Length" "Length" ...
##  $ row       : int  301 302 303 304 305 306 307 308 309 310 ...
##  $ setosa    : num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ versicolor: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ virginica : num  NA NA NA NA NA NA NA NA NA NA ...
ggplot(data = iris.tidy, mapping = aes(x = Part, y = setosa, col = Measure)) +
  geom_jitter()
## Warning: Removed 400 rows containing missing values (geom_point).

ggplot(data = iris.tidy, mapping = aes(x = Part, y = versicolor, col = Measure)) +
  geom_jitter()
## Warning: Removed 400 rows containing missing values (geom_point).

ggplot(data = iris.tidy, mapping = aes(x = Part, y = virginica, col = Measure)) +
  geom_jitter()
## Warning: Removed 400 rows containing missing values (geom_point).

# plot -- the easy way: a single plot, split into one panel per species
str(iris.tidy2)
## 'data.frame':    600 obs. of  4 variables:
##  $ Species: Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Part   : chr  "Sepal" "Sepal" "Sepal" "Sepal" ...
##  $ Measure: chr  "Length" "Length" "Length" "Length" ...
##  $ Value  : num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
ggplot(data = iris.tidy2, mapping = aes(x = Part, y = Value, col = Measure)) +
  geom_jitter() +
  # species laid out as columns of panels
  facet_grid(. ~ Species)


Aesthetics

Aesthetic Description
x X axis position
y Y axis position
colour Colour of dots, outlines of other shapes
fill Fill color
size Diameter of points, thickness of lines
alpha Transparency
linetype Line dash pattern
labels Text on a plot or axes
shape Shape of points
  • Shape

ggplot2-shape

Shapes that are hollow inside can also be given a “fill” colour


  • linetype

ggplot2-linetype ggplot2-linetype2


  • size

ggplot2-size


  • ggplot2-colour

ggplot2-colour


  • ggplot2-palette

ggplot2-palette

used for scale_color_brewer & scale_fill_brewer & scale_colour_distiller & scale_fill_distiller (palette = “”)


Modifying Aesthetics

  • Positions :

    • identity(default), dodge, stack, fill, jitter, jitterdodge

sample : jitter

# reusable position object: adds random noise to the raw points
posn.j <- position_jitter(width = 0.1)
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, col = Species)
) +
  geom_point(position = posn.j)

sample : stack & fill & dodge

# shared base plot: cyl on the x axis, bars filled by am (both as factors)
cyl.am <- ggplot(
  data = mtcars,
  mapping = aes(x = factor(cyl), fill = factor(am))
)
# no position given
cyl.am + geom_bar()

# groups stacked on top of each other
cyl.am + geom_bar(position = "stack")

# stacked and rescaled so every bar has the same total height
cyl.am + geom_bar(position = "fill")

# groups placed side by side
cyl.am + geom_bar(position = "dodge")


  • Scale Functions:

    • scale_x_continuous, scale_y, scale_color_discrete, scale_fill, scale_shape, scale_color, scale_linetype

sample :

ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, col = Species)
) +
  geom_point(position = "jitter") +
  # x axis: title, visible range, tick positions, and no padding at the ends
  scale_x_continuous(
    name = "continuous Sepal Length",
    limits = c(2, 8),
    breaks = seq(from = 2, to = 8, by = 3),
    expand = c(0, 0)
  ) +
  # colour legend: title and per-level labels (labs() can set the title too)
  scale_color_discrete(
    name = "Species",
    labels = c("Setosa1", "Versicolour2", "Virginica3")
  )


Geometries

Common Plot types

  • Scatter plots
    • points, jitter, abline
  • Bar plots
    • histogram, bar, errorbar
  • Line plots
    • line

Scatter Plot

Optional : alpha, col, fill, shape, size

  • aes() inside geom()
    • control aesthetic mappings of each layer independently
ggplot(data = iris, mapping = aes(x = Sepal.Length, y = Sepal.Width)) +
  # the colour mapping belongs to this layer only
  geom_point(mapping = aes(col = Species)) +
  # set the palette manually
  scale_color_manual(values = c("red", "blue", "green"))

The benefit: each layer can use its own data and aesthetic mappings, as in the example below

# summary statistics: per-species mean of the four measurement columns
# (naming the grouping list labels the group column "Species" directly)
iris.summary <- aggregate(
  x = iris[1:4],
  by = list(Species = iris$Species),
  FUN = mean
)
# plot: add layers that draw from a second data set (iris.summary)
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, col = Species)
) +
  geom_point() +
  # add points from other data
  geom_point(data = iris.summary, shape = 21, size = 5, fill = "black") +
  # add lines from other data: dashed vertical line at each Sepal.Length
  geom_vline(
    data = iris.summary,
    mapping = aes(xintercept = Sepal.Length, col = Species),
    linetype = 2
  ) +
  # ... and dashed horizontal line at each Sepal.Width
  geom_hline(
    data = iris.summary,
    mapping = aes(yintercept = Sepal.Width, col = Species),
    linetype = 2
  )


Bar Plots

  • Histogram
# histogram with the default binning
ggplot(data = iris, mapping = aes(x = Sepal.Width)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# bin width implied by 30 bins: data range divided by 30
(max(iris$Sepal.Width) - min(iris$Sepal.Width)) / 30
## [1] 0.08
# change the bin width
ggplot(data = iris, mapping = aes(x = Sepal.Width)) +
  geom_histogram(binwidth = 0.1)

# change to density: y shows the computed density instead of the count
ggplot(data = iris, mapping = aes(x = Sepal.Width)) +
  geom_histogram(mapping = aes(y = ..density..), binwidth = 0.1)

# fill color & position: one bar per species, side by side within each bin
ggplot(data = iris, mapping = aes(x = Sepal.Width, fill = Species)) +
  geom_histogram(binwidth = 0.1, position = "dodge")

# position: the same plot using an explicit position object
# NOTE(review): a dodge width of 6 is far larger than the 0.1 bin width --
# confirm this value is intended
posn.d <- position_dodge(width = 6)
ggplot(data = iris, mapping = aes(x = Sepal.Width, fill = Species)) +
  geom_histogram(binwidth = 0.1, position = posn.d)

# density line: frequency polygon per species on the density scale
ggplot(data = iris, mapping = aes(x = Sepal.Width, col = Species)) +
  geom_freqpoly(mapping = aes(y = ..density..), binwidth = 0.1)

# change palette: ColorBrewer "Set1" for the fill aesthetic
ggplot(data = iris, mapping = aes(x = Sepal.Width, fill = Species)) +
  geom_bar() +
  scale_fill_brewer(palette = "Set1")

# change palette by manual: one fill colour per species level
ggplot(data = iris, mapping = aes(x = Sepal.Width, fill = Species)) +
  geom_bar() +
  scale_fill_manual(values = c("red", "blue", "green"))

# change outline palette by manual: scale_color_manual() only acts on the
# colour aesthetic, so Species has to be mapped to col for it to take effect
# (with only fill mapped the scale was silently ignored)
ggplot(iris, aes(x = Sepal.Width, col = Species)) +
  geom_bar() +
  scale_color_manual(values = c("red", "blue", "green"))

  • Bar
# bar chart of Sepal.Width
ggplot(data = iris, mapping = aes(x = Sepal.Width)) +
  geom_bar()

# Distribution Bar Plots
# plyr and reshape2 are called through their namespaces instead of being
# attached: attaching plyr after dplyr masks dplyr verbs such as summarise()
# long format: one row per (Species, Measure) value; id.vars is given
# explicitly so melt() does not have to guess it (and print a message)
iris_melted <- reshape2::melt(iris, id.vars = "Species",
                              value.name = "Value",
                              variable.name = "Measure")
# mean and standard deviation of Sepal.Width for each species
iris_summ <- plyr::ddply(iris_melted[iris_melted$Measure == "Sepal.Width", ],
                         "Species", plyr::summarise,
                         avg = mean(Value), stdev = sd(Value))
str(iris_summ)
## 'data.frame':    3 obs. of  3 variables:
##  $ Species: Factor w/ 3 levels "setosa","versicolor",..: 1 2 3
##  $ avg    : num  3.43 2.77 2.97
##  $ stdev  : num  0.379 0.314 0.322
  • Errorbar
ggplot(data = iris_summ, mapping = aes(x = Species, y = avg)) +
  # stat = "identity": bar heights come straight from avg, nothing is counted
  geom_bar(stat = "identity", fill = "grey50") +
  # error bars spanning one standard deviation either side of the mean
  geom_errorbar(
    mapping = aes(ymin = avg - stdev, ymax = avg + stdev),
    width = 0.2
  )


Lineplots - TimeSeries
str(beaver1)
## 'data.frame':    114 obs. of  4 variables:
##  $ day  : num  346 346 346 346 346 346 346 346 346 346 ...
##  $ time : num  840 850 900 910 920 930 940 950 1000 1010 ...
##  $ temp : num  36.3 36.3 36.4 36.4 36.5 ...
##  $ activ: num  0 0 0 0 0 0 0 0 0 0 ...
# temperature over time, one coloured line per activ level
ggplot(
  data = beaver1,
  mapping = aes(x = time, y = temp, col = factor(activ))
) +
  geom_line()

  • linetype
# activ level distinguished by dash pattern
ggplot(
  data = beaver1,
  mapping = aes(x = time, y = temp, linetype = factor(activ))
) +
  geom_line()

  • size
# activ level distinguished by line thickness (ggplot2 warns about this)
ggplot(
  data = beaver1,
  mapping = aes(x = time, y = temp, size = factor(activ))
) +
  geom_line()
## Warning: Using size for a discrete variable is not advised.

  • fill
# filled areas per activ level, using the "fill" position
ggplot(
  data = beaver1,
  mapping = aes(x = time, y = temp, fill = factor(activ))
) +
  geom_area(position = "fill")

  • geom_ribbon
# semi-transparent band from 0 up to the temperature curve
ggplot(
  data = beaver1,
  mapping = aes(x = time, y = temp, fill = factor(activ))
) +
  geom_ribbon(mapping = aes(ymax = temp, ymin = 0), alpha = 0.3)

  • geom_rect
# highlight a fixed time/temperature window behind the lines.
# annotate() draws the rectangle exactly once; geom_rect() with constant
# aesthetics would draw one rectangle per data row, and the stacked copies
# make alpha = 0.2 look almost opaque
ggplot(beaver1, aes(x = time, y = temp, col = factor(activ))) +
  annotate("rect", xmin = 500, xmax = 1500, ymin = 36.6, ymax = 37.2,
           colour = "grey20", fill = "red", alpha = 0.2) +
  geom_line()


Statistics

Stats and Geoms

  • bin
# shared base plot for comparing the geom and the stat entry points
p <- ggplot(data = iris, mapping = aes(x = Sepal.Width))
# histogram geom
p + geom_histogram()

# bar geom
p + geom_bar()

# binning stat called directly
p + stat_bin()

  • smooth
# linear fit per species, without the confidence band
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, col = Species)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# LOESS smooth: span sets the size of the sliding, weighted window used to
# calculate the line of best fit. span is only used by method = "loess";
# with method = "lm" it was ignored.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, col = Species)) +
  geom_point() +
  stat_smooth(method = "loess", span = 0.7) +
  stat_quantile()
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique

  • quantile
# quantile regression lines per species on top of the raw points
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, col = Species)
) +
  geom_point() +
  stat_quantile()
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique

  • sum
# stat_sum() layered on top of the raw points
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, col = Species)
) +
  geom_point() +
  stat_sum()


Stats outside Geoms

  • stat_summary
# mean_sdl with mult = 1, drawn with stat_summary()'s default geom
ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length)) +
  stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1))

# the same summary built from two layers: a point for the mean plus error bars
ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length)) +
  stat_summary(fun.y = mean, geom = "point") +
  stat_summary(
    fun.data = mean_sdl,
    fun.args = list(mult = 1),
    geom = "errorbar",
    width = 0.1
  )

# bar -- not recommended
ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length)) +
  stat_summary(fun.y = mean, geom = "bar", fill = "skyblue") +
  stat_summary(
    fun.data = mean_sdl,
    fun.args = list(mult = 1),
    geom = "errorbar",
    width = 0.1
  )

  • stat_function
# MASS is needed only for the mammals data set, so it is referenced through
# its namespace instead of being attached (library(MASS) would mask
# dplyr::select(), which is loaded at the top of this file)
mam.new <- data.frame(body = log10(MASS::mammals$body))
ggplot(mam.new, aes(x = body)) +
  # histogram on the density scale so it is comparable with the curve below
  geom_histogram(aes(y = ..density..)) +
  geom_rug() +
  # normal density using the sample mean and standard deviation of body
  stat_function(fun = dnorm, colour = "red",
                args = list(mean = mean(mam.new$body),
                            sd = sd(mam.new$body)))


Coordinates and Facets

Coordinates Layer

  • Zooming in + scale_x_continuous (limits = …) + xlim() + coord_cartesian(xlim = …)
# Original Plot
iris.smooth <- ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, col = Species)
) +
  geom_point(alpha = 0.7) +
  geom_smooth()
iris.smooth

# scale_x_continuous: rows outside the limits are removed (see warnings)
iris.smooth + scale_x_continuous(limits = c(4.5, 5.5))
## Warning: Removed 95 rows containing non-finite values (stat_smooth).
## Warning: Removed 95 rows containing missing values (geom_point).

# xlim: rows outside the limits are removed here as well
iris.smooth + xlim(4.5, 5.5)
## Warning: Removed 95 rows containing non-finite values (stat_smooth).

## Warning: Removed 95 rows containing missing values (geom_point).

# coord_cartesian: zooms the view only, no rows are removed
iris.smooth + coord_cartesian(xlim = c(4.5, 5.5))

Facet format

# same mappings as before, but with lines instead of points
iris.smooth <- ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, col = Species)
) +
  geom_line(alpha = 0.7) +
  geom_smooth()
# a 1:1 aspect ratio
iris.smooth + coord_equal()

# fixed ratio
iris.smooth + coord_fixed(ratio = 0.05)

# pie chart: binned counts drawn in polar coordinates
ggplot(data = iris, mapping = aes(x = Sepal.Length, col = Species)) +
  stat_bin() +
  coord_polar()


Themes

Themes from Scratch

  • element_blank
# blank out the theme's text and rect elements
ggplot(data = iris, mapping = aes(x = Sepal.Length, col = Species)) +
  stat_bin() +
  coord_polar() +
  theme(
    text = element_blank(),
    rect = element_blank()
  )

  • theme

ggplot2-linetype

  • text

ggplot2-linetype

  • statFunction

ggplot2-linetype

  • line & axis

ggplot2-linetype

  • inheritance

ggplot2-linetype

Recycling Specific Themes

  • theme_update & theme_set

ggplot2-linetype